; Bilinear filtering and real Phong shading performed in parallel.
; This file contains SSE and AVX versions.
; Implemented in FASM by Maciej Guba.
; http://macgub.co.pl
;include 'labs.inc'
ROUND2 equ 10
;----first stage of tri proc - look at '3glass_tex' file
r_phg_bf_line_z:
if 1
; Renders one horizontal span (scan line segment) with a z-buffer test,
; per-pixel normal interpolation, multi-light shading and either a flat
; colour or a bilinear-filtered texture sample (SSE / SSE3 / SSE4.1).
;
; in:
;    xmm0 = normal vector 1
;    xmm1 = normal vect 2
;    xmm3 = lo -> hi tx1, ty1, z1 coords as dwords float
;    xmm5 = lo -> hi tx2, ty2, z2 coords as dwords float
;    xmm2 = lo -> hi y_min, y_max, x_min, x_max
;           as dword integers
;    mm0  = color
;    eax  = x1
;    ebx  = x2
;    ecx  = y
;    edi  = screen buffer
;    esi  = z buffer filled with dd floats
;    edx  = texture pointer (handle)
;    xmm6 = lo -> hi dword x_res, tex_shift, tex_x * 4,
;          tex size as dword integers
; out:
;    nothing in registers; pixels are written to [edi], depths to [esi]
; clobbers:
;    eax, ebx, ecx, edx, esi, edi, xmm0-xmm7, flags (ebp is preserved)
; notes:
;    - y range checking and x1 < x2 ordering are not done here
;      (presumably done by the caller - see the '3glass_tex' file).
;    - an empty span (x2 <= x1 after clipping) is skipped.

     push   ebp
     mov    ebp,esp
     sub    esp,256+31+20
     and    ebp,0xfffffff0      ; 16 byte aligned frame base for movaps
     sub    ebp,144             ; locals live in [ebp-128 .. ebp+127]

    .n1        equ [ebp-16]
    .x_res     equ [ebp-32]
    .tex_shift equ [ebp-28]
    .tex_x4    equ [ebp-24]
    .tex_size  equ [ebp-20]

    .tex_m2 equ [ebp-48]
    .lx1    equ [ebp-52]
    .lx2    equ [ebp-56]
    .x_resr equ [ebp-60]
    .y      equ [ebp-64]
    .screen equ [ebp-68]
    .zbuff  equ [ebp-72]
    .x_max  equ [ebp-74]
    .x_min  equ [ebp-76]
    .y_max  equ [ebp-78]
    .y_min  equ [ebp-80]
    .dn     equ [ebp-96]
    .yd     equ [ebp-100]
    .xd     equ [ebp-104]
    .yf     equ [ebp-108]
    .xf     equ [ebp-112]
    .cnv    equ [ebp-128]

    .z1     equ [ebp+8]
    .ty1    equ [ebp+4]
    .tx1    equ [ebp]
    .z2     equ [ebp+24]
    .ty2    equ [ebp+20]
    .tx2    equ [ebp+16]
   ;.cz     equ [ebp+40]
   ;.cty    equ [ebp+36]
    .col    equ [ebp+32]
    .dz     equ [ebp+72]
    .dty    equ [ebp+68]
    .dtx    equ [ebp+64]
    .the_one   equ [ebp+80]
    .tx_ptr    equ [ebp+84]
    .mask_255f equ [ebp+96]
    .lightSt   equ [ebp+112]

        ; ---- store the arguments in the local frame
        mov      .y,ecx
        packssdw xmm2,xmm2          ; clip window: 4 dwords -> 4 signed words
        movlps   .y_min,xmm2        ; y_min, y_max, x_min, x_max
        movaps   .x_res,xmm6        ; x_res, tex_shift, tex_x4, tex_size
        mov      ecx,.tex_x4
        shr      ecx,2
        dec      ecx                ; ecx = (tex_x4 / 4) - 1
        cvtsi2ss xmm6,ecx
        shufps   xmm6,xmm6,0
        movaps   xmm7,[mask_255f]
        movaps   .tex_m2,xmm6       ; upper bound for the texture coords
        movaps   .mask_255f,xmm7
        movaps   xmm7,[the_one]
        movlps   .the_one,xmm7      ; high dword is overwritten by .tx_ptr below
        movaps   .n1,xmm0

        mov      .lx1,eax
        mov      .lx2,ebx

        movaps   .tx1,xmm3
        movaps   .tx2,xmm5
        mov      .tx_ptr,edx

        ; ---- per pixel deltas: (end - start) * approx. 1/(x2 - x1)
        sub      ebx,eax
        cvtsi2ss xmm7,ebx
        rcpss    xmm7,xmm7
        shufps   xmm7,xmm7,0
        subps    xmm1,xmm0
        mulps    xmm1,xmm7
        movaps   .dn,xmm1           ; normal delta
        subps    xmm5,xmm3
        mulps    xmm5,xmm7
        movaps   .dtx,xmm5          ; tx, ty, z delta

        ; ---- clip the left end against x_min
        mov      ebx,.lx1
        cmp      bx,.x_min     ; clipping on function4
        jge      @f
        movsx    eax,word .x_min    ; signed, to match the signed compare above
        sub      eax,ebx            ; eax = number of pixels cut off
        cvtsi2ss xmm7,eax
        shufps   xmm7,xmm7,0
        mulps    xmm5,xmm7
        mulps    xmm1,xmm7
        addps    xmm5,.tx1          ; advance the start values to x_min
        addps    xmm1,.n1
        movsx    eax,word .x_min
        movaps   .tx1,xmm5
        movaps   .n1,xmm1
        mov      dword .lx1,eax
      @@:
        ; ---- expand the packed colour in mm0 to four floats
        xorps     xmm0,xmm0
        movq2dq   xmm2,mm0
        punpcklbw xmm2,xmm0
        punpcklwd xmm2,xmm0
        cvtdq2ps  xmm2,xmm2
        movaps    .col,xmm2

        ; ---- clip the right end against x_max (unsigned compare)
        movzx     eax,word .x_max
        mov       ecx,.lx2
        cmp       .lx2,eax
        cmova     ecx,eax
        mov       .lx2,ecx

        ; ---- buffer offsets: (x_res * y + lx1) * 4
        mov       eax,.x_res
        btr       eax,31            ; bit 31 of x_res is a flag, strip it
        mov       .x_resr,eax
        mul       dword .y          ; clobbers edx (texture ptr already saved)
        add       eax,.lx1
        shl       eax,2
        add       edi,eax
        add       esi,eax
        mov       ecx,.lx2
        sub       ecx,.lx1          ; ecx = number of pixels to draw
        jle       .end_line         ; nothing to draw - a zero or negative count
                                    ; would make the dec/jnz loop wrap around
        cld
   .ddraw:
        movaps    xmm2,.tx1
        push      ecx               ; pixel counter, restored at .skip
        movhlps   xmm7,xmm2         ; xmm7 low dword = current z
        cmpnltss  xmm7,dword[esi]   ; all ones when z is not less than stored z
        movd      eax,xmm7
        or        eax,eax
        jnz       .skip             ; pixel is hidden
        xorps     xmm4,xmm4         ; xmm4 = normal perturbation, zero by default
        cmp       [rph_bump_flag],0
        je        @f
        call      calc_r_phg_bumps
      @@:
        xorps     xmm5,xmm5         ; xmm5 accumulates the light intensity
        movhlps   xmm7,xmm2
        movss     [esi],xmm7        ; store the new depth
        movaps    xmm7,.n1   ; xmm0
        addps     xmm7,xmm4

   if 0
        cvtps2dq xmm6,xmm2
        sub      esp,8
        movlps   [esp],xmm6
        pop      eax ebx
        mov      cl,.tex_shift
        shl      ebx,cl
        add      eax,ebx
        and      eax,.tex_size
        shl      eax,2
        add      eax,.tx_ptr
     ;  movd     xmm6,[eax] ; mapp normal attempt
        pmovzxbd xmm6,[eax] ; xmm6
        cvtdq2ps xmm6,xmm6
        movaps   xmm4,xmm6
        dpps     xmm6,xmm6,01110111b
        rsqrtps  xmm6,xmm6
        mulps    xmm4,xmm6
        addps    xmm7,xmm4
  end if
        dpps      xmm7,xmm7,01111111b   ; squared length of x,y,z in all lanes
        ; normalize
        rsqrtps   xmm7,xmm7
        mulps     xmm7,.n1   ; xmm0
        movaps    .cnv,xmm7             ; current normal vector

        ; ---- split texture coords into integer and fractional part
        movaps    xmm6,xmm2
        minps     xmm6,.tex_m2  ;    float  TEX_X-2,TEX_Y-2
        cvttps2dq xmm7,xmm6             ; xmm7 = integer tx, ty (kept until .phg_col)
        cvtdq2ps  xmm4,xmm7
        subps     xmm6,xmm4
        movlps    .xf,xmm6              ; fractions xf, yf for the bilinear filter
        xor       ecx,ecx               ; ecx = light index
        mov       eax,lights_aligned    ; global
      .ag_l:
        push     ecx
        movaps   xmm2,.tx1
        cmp      [stencil_s_flag],1
        jne      .no_shd
        or       ecx,ecx
        jne      .no_shd      ; only the light with index 0 is shadow tested
        imul     ecx,44
        mov      ebx,ecx
        add      ebx,shd_stencil_mx_A ;+44  ; check only two lights
        cmp      dword[ebx],-1
        je       .no_shd      ; light vector prependicular to screen
        add      ecx,stencil_shd_A_light_ptr
        cmp      dword[ecx],0 ; non allocated mem
        je       .no_shd      ; prevention.. (tested before anything is
                              ; modified, so the stack, esi and edi stay intact)
        movhlps  xmm1,xmm2    ; xmm1 low dword = current z
        mov      ebx,.y       ; ebx is free scratch from here on
        imul     ebx,.x_resr  ; reseted small texture bit
        add      ebx,.lx1
        shl      ebx,2
        add      ebx,[ecx]    ; ebx -> shadow depth of this pixel
        xorps    xmm0,xmm0    ; contribution of a shadowed light is zero
        cmpnltss xmm1,dword[ebx]
        movd     ecx,xmm1
        or       ecx,ecx
        jnz      .in_shd
     .no_shd:
        xorps    xmm1,xmm1
        movaps   xmm0,[eax] ; calc multple lights
        pcmpeqd  xmm2,xmm2
        mulps    xmm0,.cnv  ;.lv  ; last dword should be zeroed
        psrld    xmm2,1     ; xmm2 = 0x7fffffff in every lane
        haddps   xmm0,xmm0
        haddps   xmm0,xmm0  ; dot product broadcast to all lanes
        andps    xmm0,xmm2  ;[abs_mask]  ;calc absolute value
        maxps    xmm0,xmm1
        movaps   xmm1,xmm0
        mulps    xmm0,[eax+16]      ; first term:  dot * [light+16]
        mulps    xmm1,xmm1
        mulps    xmm1,xmm1
        mulps    xmm1,xmm1
        mulps    xmm1,xmm1          ; dot ^ 16
        mulps    xmm1,[eax+48]      ; second term: dot^16 * [light+48]
        addps    xmm0,xmm1
    .in_shd:
        maxps    xmm5,xmm0          ; keep the brightest light per channel
        add      eax,64             ; next light record
        pop      ecx
        inc      ecx
        cmp      eax,lights_aligned_end
        jnz      .ag_l
        minps    xmm5,.mask_255f    ; clamp the intensity
        cmp      [draw_flag],15
        jne      @f
        mulps    xmm5,.col          ; flat colour mode
        jmp      .phg_col
      @@:
        ;texture coords work
        sub      esp,8
        movlps   [esp],xmm7
        pop      eax ebx            ; eax = tx, ebx = ty (integers)
        mov      cl,.tex_shift
        shl      ebx,cl
        add      eax,ebx
        and      eax,.tex_size      ; wrap inside the texture
        shl      eax,2
        add      eax,.tx_ptr        ; eax -> texel (tx, ty)
        mov      ebx,eax
        add      ebx,.tex_x4        ; ebx -> texel (tx, ty+1)
        movlps   xmm7,[eax]         ; two texels of the upper row
        movlps   xmm6,[ebx]         ; two texels of the lower row
        movlps   xmm1,.xf
        call     bi_filter
  ;      movaps   xmm2,xmm7  
  ;      movaps   xmm0,xmm7
  ;      dpps     xmm2,xmm2,01110111b
  ;      rsqrtps  xmm2,xmm2
  ;      mulps    xmm0,xmm2
  ;      mulps    xmm0,.cnv
  ;      divps    xmm7,xmm0
        mulps    xmm5,xmm7
      .phg_col:
        cvtps2dq xmm5,xmm5
        psrld    xmm5,8             ; light * colour / 256
        packssdw xmm5,xmm5
        packuswb xmm5,xmm5          ; saturate to bytes
        movss    [edi],xmm5         ; write the pixel
    .skip:
        pop      ecx
        add      edi,4
        add      esi,4
        movaps   xmm2,.tx1
        movaps   xmm0,.n1     ; cur normal
        addps    xmm2,.dtx
        addps    xmm0,.dn
        movaps   .tx1,xmm2
        inc      dword .lx1   ; current x, used by the shadow test
        movaps   .n1,xmm0
        ;loop     .ddraw
        dec      ecx
        jnz      .ddraw
  .end_line:
        add      esp,256+31+20
        pop      ebp

ret
;================================================================
 bi_filter:
 ; Blends a 2x2 block of texels with bilinear weights (needs SSE4.1).
 ;
 ;     in:  xmm7 - low qword: r1g1b1XX r2g2b2XX  (upper texel pair, bytes)
 ;          xmm6 - low qword: r3g3b3XX r4g4b4XX  (lower texel pair, bytes)
 ;          xmm1 - low qword: xf, yf  (fractional coords, floats)
 ;     out: xmm7 - r, g, b, 0 as 32 bit floats
 ;
 ; uses:  xmm7, xmm6, xmm4, xmm3, xmm1 (eax is preserved)

        ; ---- weights --------------------------------------------------
        push      eax                   ; saved, eax carries the constant
        mov       eax,1.0
        push      eax eax               ; [esp] = 1.0, 1.0
        movlps    xmm3,[esp]
        subps     xmm3,xmm1             ; low half: 1-xf, 1-yf
        movlhps   xmm3,xmm1             ; xmm3 = 1-xf, 1-yf, xf, yf
        movaps    xmm1,xmm3
        shufps    xmm3,xmm3,10001000b   ; 1-xf,  xf,   1-xf, xf
        shufps    xmm1,xmm1,11110101b   ; 1-yf,  1-yf, yf,   yf
        mulps     xmm3,xmm1             ; w1, w2, w3, w4
        add       esp,8                 ; drop the two constants
        pop       eax

        ; ---- regroup the bytes by channel -----------------------------
        punpcklbw xmm7,xmm6
        ; xmm7: r1 r3 g1 g3 b1 b3 XX XX :: r2 r4 g2 g4 b2 b4 XX XX
        movhlps   xmm6,xmm7
        punpcklbw xmm7,xmm6
        ; xmm7: r1 r2 r3 r4 :: g1 g2 g3 g4 :: b1 b2 b3 b4 :: XX XX XX XX
        movhlps   xmm4,xmm7             ; xmm4 low dword = b1..b4
        movaps    xmm6,xmm7
        psrldq    xmm6,4                ; xmm6 low dword = g1..g4

        ; ---- bytes -> dwords -> floats ---------------------------------
        pmovzxbd  xmm7,xmm7
        pmovzxbd  xmm6,xmm6
        pmovzxbd  xmm4,xmm4
        cvtdq2ps  xmm7,xmm7             ; r1..r4
        cvtdq2ps  xmm6,xmm6             ; g1..g4
        cvtdq2ps  xmm4,xmm4             ; b1..b4

        ; ---- weighted sums, one channel per lane, other lanes zeroed --
        dpps      xmm7,xmm3,11110001b   ; r -> lane 0
        dpps      xmm6,xmm3,11110010b   ; g -> lane 1
        dpps      xmm4,xmm3,11110100b   ; b -> lane 2
        orps      xmm7,xmm4
        orps      xmm7,xmm6             ; xmm7 = r, g, b, 0
ret
end if
;=========================================================================
if 0
;r_phg_bf_line_z_avx:
; AVX variant of the span renderer. The whole block sits inside 'if 0'
; and its entry label is commented out, so it is currently not assembled.
; Differences from the SSE routine, as far as the code below shows:
;   - the y range test, end point sorting and x rejection are done here,
;   - texture size comes from the constants TEX_SHIFT / TEX_X and the
;     global [tex_m2] (none of them defined in this part of the file),
;   - there is no flat colour mode and no shadow stencil test.
; in:
;    xmm0 - normal vector 1
;    xmm1 - normal vect 2
;    xmm3 - lo -> hi tx1, ty1, z1 coords as dwords float
;    xmm5 - lo -> hi tx2, ty2, z2 coords as dwords float
;    xmm2 - lo -> hi y_min, y_max, x_min, x_max
;           as dword integers
;    eax - x1
;    ebx - x2
;    ecx - y
;    edi - screen buffer
;    esi - z buffer filled with dd floats
;    edx - texture pointer (handle)
;    xmm6 - lowest dword x_res as integer

   push  ebp
   mov   ebp,esp
   sub   esp,256+15
   and   ebp,0xfffffff0       ; 16 byte aligned frame base for vmovaps
   sub   ebp,128

 .n1     equ [ebp-16]
 .n2     equ [ebp-32]
 .tex_m2 equ [ebp-48]
 .lx1    equ [ebp-52]
 .lx2    equ [ebp-56]
 .x_res  equ [ebp-60]
 .y      equ [ebp-64]
 .screen equ [ebp-68]
 .zbuff  equ [ebp-72]
 .x_max  equ [ebp-74]
 .x_min  equ [ebp-76]
 .y_max  equ [ebp-78]
 .y_min  equ [ebp-80]
 .dn     equ [ebp-96]
 .yd     equ [ebp-100]
 .xd     equ [ebp-104]
 .yf     equ [ebp-108]
 .xf     equ [ebp-112]
 .cnv    equ [ebp-128]


 .z1     equ [ebp+8]
 .ty1    equ [ebp+4]
 .tx1    equ [ebp]
 .z2     equ [ebp+24]
 .ty2    equ [ebp+20]
 .tx2    equ [ebp+16]
 .cz     equ [ebp+40]
 .cty    equ [ebp+36]
 .ctx    equ [ebp+32]
 .dz     equ [ebp+72]
 .dty    equ [ebp+68]
 .dtx    equ [ebp+64]
 .tx_ptr equ [ebp+80]
 .the_one   equ [ebp+84]
 .mask_255f equ [ebp+96]



        mov       .y,ecx
        vpackssdw xmm2,xmm2,xmm2     ; clip window: 4 dwords -> 4 signed words
        vmovss    .x_res,xmm6
        vmovlps   .y_min,xmm2        ; y_min, y_max, x_min, x_max
        cmp       cx,.y_min
        jl        .end_line
        cmp       cx,.y_max
        jge       .end_line          ; line is outside the vertical range

        cmp       eax,ebx
        je        .end_line          ; zero length span
        jl        .no_sort
        ; x1 > x2: swap the end points together with their attributes
        xchg      eax,ebx
        vmovaps   xmm7,xmm0
        vmovaps   xmm6,xmm3
        vmovaps   xmm0,xmm1
        vmovaps   xmm3,xmm5
        vmovaps   xmm1,xmm7
        vmovaps   xmm5,xmm6
   .no_sort:
        cmp       ax,.x_max
        jge       .end_line          ; span starts right of the window
        cmp       bx,.x_min
        jle       .end_line          ; span ends left of the window
        vmovaps   xmm4,[tex_m2]
        vmovaps   xmm7,[mask_255f]
        vmovaps   .tex_m2,xmm4
        vmovaps   .mask_255f,xmm7
        vmovaps   xmm7,[the_one]
        vmovlps   .the_one,xmm7
        vmovaps   .n1,xmm0
        vmovaps   .n2,xmm1
        mov       .lx1,eax
        mov       .lx2,ebx
        vmovaps    .tx1,xmm3
        vmovaps   .tx2,xmm5
        mov       .tx_ptr,edx
        ; per pixel deltas: (end - start) * approx. 1/(x2 - x1)
        sub       ebx,eax
        vcvtsi2ss xmm7,xmm7,ebx
        vrcpss    xmm7,xmm7,xmm7
        vshufps   xmm7,xmm7,xmm7,0
        vsubps    xmm1,xmm1,xmm0
        vmulps    xmm1,xmm1,xmm7
        vmovaps   .dn,xmm1
        vsubps    xmm5,xmm5,xmm3
        vmulps    xmm5,xmm5,xmm7
        vmovaps   .dtx,xmm5



        mov       ebx,.lx1
        cmp       bx,.x_min     ; clipping on function4
        jge       @f
        ; NOTE(review): movzx here vs. signed compare above and movsx below -
        ; only equivalent while x_min is not negative; confirm.
        movzx     eax,word .x_min
        sub       eax,ebx
        vcvtsi2ss xmm7,xmm7,eax
        vshufps   xmm7,xmm7,xmm7,0
        vmulps    xmm5,xmm5,xmm7
        vmulps    xmm1,xmm1,xmm7
        vaddps    xmm5,xmm5,.tx1     ; advance the start values to x_min
        vaddps    xmm1,xmm1,.n1
        movsx     eax,word .x_min
        vmovaps   .tx1,xmm5
        vmovaps   .n1,xmm1
        mov       dword .lx1,eax
      @@:
        ; clip the right end against x_max (signed compare)
        movzx     eax,word .x_max
        cmp       .lx2,eax
        jl        @f
        mov       .lx2,eax
      @@:
        ; buffer offsets: (x_res * y + lx1) * 4
        mov       eax,.x_res
        mul       dword .y           ; clobbers edx (texture ptr already saved)
        add       eax,.lx1
        shl       eax,2
        add       edi,eax
        add       esi,eax

        mov       ecx,.lx2
        sub       ecx,.lx1           ; ecx = number of pixels to draw
        vmovaps   xmm2,.tx1          ; xmm2 holds tx, ty, z for the whole loop
        cld
   .ddraw:
        vmovhlps  xmm7,xmm7,xmm2     ; xmm7 low dword = current z
        vcmpnltss xmm7,xmm7,dword[esi]
        vmovmskps eax,xmm7
        bt        eax,0
        jc        .skip              ; z is not less than the stored depth

        vxorps   xmm4,xmm4,xmm4      ; xmm4 = normal perturbation, zero by default
        cmp      [rph_bump_flag],0
        je       @f
        call     calc_r_phg_bumps
      @@:
        vxorps   xmm5,xmm5,xmm5      ; xmm5 accumulates the light intensity
        vmovhlps xmm7,xmm7,xmm2
        vmovss   [esi],xmm7          ; store the new depth
        vmovaps  xmm7,.n1
        vaddps   xmm7,xmm7,xmm4
        vmulps   xmm7,xmm7,xmm7 ; normalize
        vhaddps  xmm7,xmm7,xmm7
        vhaddps  xmm7,xmm7,xmm7
        vrsqrtps xmm7,xmm7
        vmulps   xmm7,xmm7,.n1
        vmovaps  .cnv,xmm7           ; current normal vector

        ; split texture coords into integer and fractional part
        vmovaps    xmm6,xmm2
        vminps     xmm6,xmm6,.tex_m2  ;    float  TEX_X-2,TEX_Y-2
        vcvttps2dq xmm7,xmm6          ; xmm7 = integer tx, ty
        vcvtdq2ps  xmm4,xmm7
        vsubps     xmm6,xmm6,xmm4
        vmovlps    .xf,xmm6           ; fractions xf, yf

        mov       eax,lights_aligned   ; global
      @@:
        vxorps    xmm1,xmm1,xmm1
        vmovaps   xmm0,[eax] ; calc multple lights
        vmulps    xmm0,xmm0,.cnv  ; last dword should be zeroed
        vhaddps   xmm0,xmm0,xmm0
        vhaddps   xmm0,xmm0,xmm0  ; dot product broadcast to all lanes
   ;     andps    xmm0,[abs_val]  ; calc absolute value
        vmaxps    xmm0,xmm0,xmm1
        vmovaps   xmm1,xmm0
        vmulps    xmm0,xmm0,[eax+16]   ; first term:  dot * [light+16]
        vmulps    xmm1,xmm1,xmm1
        vmulps    xmm1,xmm1,xmm1
        vmulps    xmm1,xmm1,xmm1
        vmulps    xmm1,xmm1,xmm1       ; dot ^ 16
        vmulps    xmm1,xmm1,[eax+48]   ; second term: dot^16 * [light+48]
        vaddps    xmm0,xmm0,xmm1
        vmaxps    xmm5,xmm5,xmm0       ; keep the brightest light per channel
        add       eax,64               ; next light record
        cmp       eax,lights_aligned_end
        jnz       @b
        vminps    xmm5,xmm5,.mask_255f ; clamp the intensity

          ; texture coords work
        sub       esp,8
        vmovlps   [esp],xmm7
        pop       eax ebx              ; eax = tx, ebx = ty (integers)
        shl       ebx,TEX_SHIFT
        add       eax,ebx
        shl       eax,2
        add       eax,.tx_ptr          ; eax -> texel (tx, ty)
        mov       ebx,eax
        add       ebx,TEX_X*4          ; ebx -> texel (tx, ty+1)

        vmovups   xmm7,[eax]
        vmovups   xmm6,[ebx]
        vmovups   xmm1,.xf
        call      bi_filter_avx2
        vmulps    xmm5,xmm5,xmm7
        vcvtps2dq xmm5,xmm5
        vpsrld    xmm5,xmm5,8          ; light * colour / 256
        vpackssdw xmm5,xmm5,xmm5
        vpackuswb xmm5,xmm5,xmm5       ; saturate to bytes
        vmovd     [edi],xmm5           ; write the pixel
     .skip:
        add       edi,4
        add       esi,4
        vmovaps   xmm0,.n1     ; cur normal
        vaddps    xmm0,xmm0,.dn
        vaddps    xmm2,xmm2,.dtx
        vmovaps   .n1,xmm0
  ;      loop     .ddraw
        dec       ecx
        jnz       .ddraw

  .end_line:
        add       esp,256+15
        pop       ebp

ret

;============================================================
bi_filter_avx2:
 ; AVX2 bilinear blend of a 2x2 block of texels (disabled: inside 'if 0').
 ;
 ;     in: xm7 - first pack r1g1b1XXr2g2b2XX as bytes
 ;         xm6 - sec        r3g3b3XXr4g4b4XX as bytes
 ;         xm1 - xf, yf
 ;     out: xm7: r, g, b, as 32 bit floats
 ;
 ; uses:  ym7,6,3,1
 ;
 ; NOTE(review): the last vperm2f128 (imm 00000010b) copies the low lane of
 ; ymm7 into both lanes, so the green sum held in the high lane looks lost
 ; and xmm7 would end as r, r, b, b - confirm before enabling this code.
 ; NOTE(review): no vzeroupper before ret although ymm registers are written.
        vmovlhps   xmm7,xmm7,xmm6          ; all four texels in one register
        vpshufb    xmm7,xmm7,[shuf_bifil]  ;     db 0, 4, 8, 12, 1, 5, 9, 13, 2, 6, 10, 14
        vmovhlps   xmm6,xmm6,xmm7          ; xmm6 low qword = upper half of xmm7
        vpmovzxbd  ymm6,xmm6               ; low 8 bytes -> 8 dwords
        vpmovzxbd  ymm7,xmm7               ; low 8 bytes -> 8 dwords
     ; calc w .........
        vmovups    xmm3,[the_one]  ;  broadcasted dword 1.0
        vcvtdq2ps  ymm7,ymm7
     ;   movaps    xmm1,.xf
        vsubps     xmm3,xmm3,xmm1 ;[xf]
     ;   cvtdq2ps  xmm6,xmm6
        vmovlhps   xmm3,xmm3,xmm1 ;[xf]
        vcvtdq2ps  ymm6,ymm6
        vmovaps    xmm1,xmm3  ; 1-xf, 1-yf, xf, yf
        vshufps    xmm3,xmm3,xmm3,10001000b   ; 1-xf, xf,   1-xf, xf
        vshufps    xmm1,xmm1,xmm1,11110101b   ; 1-yf, 1-yf, yf,   yf
        vmulps     xmm3,xmm3,xmm1             ; the four bilinear weights
        vperm2f128 ymm3,ymm3,ymm3,00100000b   ; weights copied to both lanes
        vmulps     ymm6,ymm6,ymm3

        vmulps     ymm7,ymm7,ymm3
        vhaddps    ymm7,ymm7,ymm7
        vhaddps    ymm6,ymm6,ymm6
        vhaddps    ymm7,ymm7,ymm7             ; per lane sum of ymm7
        vhaddps    ymm6,ymm6,ymm6             ; per lane sum of ymm6
        vperm2f128 ymm7,ymm7,ymm7,00000010b
        vmovlhps   xmm7,xmm7,xmm6             ; xmm7 = xmm7 low qword, xmm6 low qword

ret
;=====================================================================
end if
